Import necessary packages
# Packages this analysis depends on.
packages <- c(
  "dplyr",
  "ggplot2",
  "plotly",
  "data.table",
  "readr"
)
# Attach each package; the printed list holds one TRUE/FALSE per package,
# in the same order as `packages`.
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
lapply(packages, require, character.only = TRUE)
[[1]]
[1] TRUE
[[2]]
[1] TRUE
[[3]]
[1] TRUE
[[4]]
[1] TRUE
[[5]]
[1] TRUE
Import the economist data
# Read the raw CSV into a tibble (path is relative to the working directory).
df <- readr::read_csv(file = "datasets/Economist_Assignment_Data.csv")
New names:Rows: 173 Columns: 6── Column specification ──────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Country, Region
dbl (4): ...1, HDI.Rank, HDI, CPI
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Discard the first column (the unnamed one that readr labelled `...1`).
df <- df[-1]
Explore the data
# Preview the first rows, then list every column with its type.
df %>% head()
df %>% glimpse()
Rows: 173
Columns: 5
$ Country <chr> "Afghanistan", "Albania", "Algeria", "Angola", "Argentina", "Armenia", "Australia",…
$ HDI.Rank <dbl> 172, 70, 96, 148, 45, 86, 2, 19, 91, 53, 42, 146, 47, 65, 18, 167, 141, 108, 74, 11…
$ HDI <dbl> 0.398, 0.739, 0.698, 0.486, 0.797, 0.716, 0.929, 0.885, 0.700, 0.771, 0.806, 0.500,…
$ CPI <dbl> 1.5, 3.1, 2.9, 2.0, 3.0, 2.6, 8.8, 7.8, 2.4, 7.3, 5.1, 2.7, 7.8, 2.4, 7.5, 3.0, 5.7…
$ Region <chr> "Asia Pacific", "East EU Cemt Asia", "MENA", "SSA", "Americas", "East EU Cemt Asia"…
# Per-column summary statistics.
df %>% summary()
Country HDI.Rank HDI CPI Region
Length:173 Min. : 1.00 Min. :0.2860 Min. :1.500 Length:173
Class :character 1st Qu.: 47.00 1st Qu.:0.5090 1st Qu.:2.500 Class :character
Mode :character Median : 96.00 Median :0.6980 Median :3.200 Mode :character
Mean : 95.28 Mean :0.6581 Mean :4.052
3rd Qu.:143.00 3rd Qu.:0.7930 3rd Qu.:5.100
Max. :187.00 Max. :0.9430 Max. :9.500
Transform the data
# Convert the two text columns to factors.
df <- df %>%
  mutate(across(c(Country, Region), factor))
Check data after transform
# Preview the first rows, then list every column with its type.
df %>% head()
df %>% glimpse()
Rows: 173
Columns: 5
$ Country <fct> Afghanistan, Albania, Algeria, Angola, Argentina, Armenia, Australia, Austria, Azer…
$ HDI.Rank <dbl> 172, 70, 96, 148, 45, 86, 2, 19, 91, 53, 42, 146, 47, 65, 18, 167, 141, 108, 74, 11…
$ HDI <dbl> 0.398, 0.739, 0.698, 0.486, 0.797, 0.716, 0.929, 0.885, 0.700, 0.771, 0.806, 0.500,…
$ CPI <dbl> 1.5, 3.1, 2.9, 2.0, 3.0, 2.6, 8.8, 7.8, 2.4, 7.3, 5.1, 2.7, 7.8, 2.4, 7.5, 3.0, 5.7…
$ Region <fct> Asia Pacific, East EU Cemt Asia, MENA, SSA, Americas, East EU Cemt Asia, Asia Pacif…
# Per-column summary; factor columns are reported as level counts.
df %>% summary()
Country HDI.Rank HDI CPI Region
Afghanistan: 1 Min. : 1.00 Min. :0.2860 Min. :1.500 Americas :31
Albania : 1 1st Qu.: 47.00 1st Qu.:0.5090 1st Qu.:2.500 Asia Pacific :30
Algeria : 1 Median : 96.00 Median :0.6980 Median :3.200 East EU Cemt Asia:18
Angola : 1 Mean : 95.28 Mean :0.6581 Mean :4.052 EU W. Europe :30
Argentina : 1 3rd Qu.:143.00 3rd Qu.:0.7930 3rd Qu.:5.100 MENA :18
Armenia : 1 Max. :187.00 Max. :0.9430 Max. :9.500 SSA :46
(Other) :167
Plot graph from the data
# Scatter plot of HDI against CPI, coloured by region, with one fitted
# trend line. The `text` aesthetic carries a per-country label (not drawn by
# ggplot2 itself) so it is available to the interactive version of the plot.
pl <- df %>%
  ggplot(aes(x = CPI, y = HDI, text = paste("Country:", Country))) +
  geom_point(aes(color = Region), size = 3) +
  # group = 1 fits a single line through all points rather than one per
  # group; the model is HDI regressed on log(CPI), drawn without the
  # confidence band.
  geom_smooth(
    aes(group = 1),
    color = "red",
    se = FALSE,
    method = "lm",
    formula = y ~ log(x)
  ) +
  scale_x_continuous(
    name = "Corruption Perceptions Index, 2011 (10 = least corrupt)",
    limits = c(1, 10),
    breaks = 1:10
  ) +
  scale_y_continuous(
    name = "Human Development Index, 2011 (1 = best)",
    limits = c(0.2, 1),
    breaks = seq(0.2, 1, 0.1)
  ) +
  ggtitle("Corruption and human development") +
  theme_bw()
Create an interactive graph
# Convert the static ggplot object into an interactive plotly widget.
plotly::ggplotly(p = pl)